#########################################

## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2025)

# Description:

## This code use monte carlo simulation to draw 200 words from the estimated
## multinomial speaker model.
## This code is based on ~/source/analysis/magnitude_montecarlo/code.R
## from the replication package of Gentzkow et al. (2019)
## available here: https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA16566

#########################################

library(data.table)
library(dplyr)
library(stringr)
library(Matrix)





data.dir <- "../data/2_processed_data"
dir   <- "../data/3_model_output"

################################################################################

######### BLOC #################################################

################################################################################

# Loading the probabilities for multinomial model


q <- readRDS(file = paste(dir, "q_bloc_80_2021.rds", sep = "/"))



speaker_metadata <- fread(file = 
                          paste(data.dir, "speeches_session_lemma.csv", sep = "/"), 
                          drop = "text", encoding="UTF-8")


## Sessions

sessions <- unique(speaker_metadata$session)


## using all sessions
sess <- sort(sessions)

phrase_seq_list <- vector("list", length(sessions))
names(phrase_seq_list) <- sessions

magnitude_list <- vector("list", length(sessions))
names(magnitude_list) <- vector("list", length(sessions))

# number of words 

J <- 200

options(warn = 1)


for (s in sess){
  
  print(s)
  
  set.seed(which(sess %in% s))
  
  id_sess <- unique(speaker_metadata[session == s,]$pid_session)
  
  # Q_i_j Is the probability of speaker i (speaker 1 to nrow(Q)/2 are original speakers
  # while others are the speaker clones) uttering word j
  Q <- q[rownames(q) %in% id_sess,]
  
  # Step 1. Original speakers and meta data 
  ## the first half is the original speakers and the second half are the clones
  
  orig_spkrs  <- rownames(Q)[1: (nrow(Q)/2)]
  ## draw random party for the original speakers 
  party       <- rbinom(length(orig_spkrs), 1, .5)
  ## Meta data on the original speakers
  speaker_metadata[, party_id_numeric := ifelse(bloc == "H", 1, 0)]
  meta <- speaker_metadata[which(speaker_metadata$pid_session %in% orig_spkrs), ]
  
  # Indicating whether randomized party matches true party for selecting 
  # the appropriate model Qs
  
  orig_ind    <- as.numeric(party == meta$party_id_numeric) 
  
  # generate q vector, taking true rows from Q if orig_ind == 1 and random rows otherwise
  q_party     <- Q[1:length(orig_spkrs), ] * orig_ind + 
    Q[(length(orig_spkrs) + 1):(2 * length(orig_spkrs)), ] * (1 - orig_ind)
  
  phrase_seq <- matrix(NA, ncol = J, nrow = nrow(q_party))
  for(j in 1:J){
    # for each spkr, draw 100 phrases from MN(200, q_party)
    # we are drawing one and one word per speaker
    phrase_seq[, j] <- sapply(1:nrow(q_party), 
                              function(i) colnames(q_party)[
                                which(as.logical(rmultinom(1, 1, q_party[i, ])))])
  }
  
  phrase_seq_list[[glue::glue("{s}")]] <- phrase_seq
  
  # extract q for H and V separately
  true_party  <- meta$party_id_numeric
  true_party  <- c(true_party, (1 - true_party))
  q_H         <- Q[which(true_party == 1), ]
  q_V         <- Q[which(true_party == 0), ]
  q_H         <- q_H[match(rownames(q_party), rownames(q_H)), ]
  q_V         <- q_V[match(rownames(q_party), rownames(q_V)), ]
  
  # compute expected posteriors
  rho        <- matrix(NA, ncol = J, nrow = nrow(q_party))
  rho_temp   <- rep(.5, length(orig_spkrs)) # neutral priors
  for(j in 1:J){
    # collect probability distribution over drawn phrases
    q_h_temp <- diag(q_H[, phrase_seq[, j]]) # spkr-row j has drawn phrase j in this construction
    q_v_temp <- diag(q_V[, phrase_seq[, j]]) 
    # expected posteriors
    rho[, j] <- (rho_temp * q_h_temp) / (rho_temp * q_h_temp + (1 - rho_temp) * q_v_temp)
    rho_temp <- rho[, j]
  }
  
  
  # partisanship given expected posteriors
  pi     <- party * rho + (1 - party) * (1 - rho)
  pi_avg  <- apply(pi, 2, mean)
  
  sess_name <- paste("pi_avg_sess", s, sep = "_")
  
  sess_name <- str_replace_all(sess_name, "-", "_")
  
  assign(sess_name, pi_avg)
  
  
  magnitude_list[[glue::glue("{s}")]] <- pi_avg
  
  
}

magnitude_list <- magnitude_list[!sapply(magnitude_list, is.null)]


saveRDS(magnitude_list, file = paste(dir, "magnitude_list_bloc.RDS", sep = "/"))

################################################################################

######### GENDER ###############################################################

################################################################################


# Loading the probabilities for multinomial model



q <- readRDS(file = paste(dir, "q_gender_80_2021.rds", sep = "/"))


speaker_metadata <- fread(file = 
                            paste(data.dir, "speeches_session_lemma.csv", sep = "/"), 
                          drop = "text", encoding="UTF-8")

## Sessions

sessions <- unique(speaker_metadata$session)




## using all sessions

sess <- sessions





phrase_seq_list <- vector("list", length(sessions))
names(phrase_seq_list) <- sessions

magnitude_list <- vector("list", length(sessions))
names(magnitude_list) <- vector("list", length(sessions))

# Number of words to simulate

J <- 200


for (s in sess){
  
  print(s)
  
  set.seed(which(sess %in% s))
  
  
  id_sess <- unique(speaker_metadata[session == s,]$pid_session)
  
  

  Q <- q[rownames(q) %in% id_sess,]
  
  # Step 1. Original speakers and meta data 
  ## the first half is the original speakers and the second half are the clones
  
  orig_spkrs  <- rownames(Q)[1: (nrow(Q)/2)]
  ## draw random party for the original speakers # Here Party is gender
  party       <- rbinom(length(orig_spkrs), 1, .5)
  ## Meta data on the original speakers
  speaker_metadata[, party_id_numeric := ifelse(female == 1, 1, 0)]
  meta <- speaker_metadata[which(speaker_metadata$pid_session %in% orig_spkrs), ]
  
  # Indicating whether randomized party matches true party for selecting 
  # the appropriate model Qs
  
  orig_ind    <- as.numeric(party == meta$party_id_numeric) 
  
  # generate q vector, taking true rows from Q if orig_ind == 1 and random rows otherwise
  q_party     <- Q[1:length(orig_spkrs), ] * orig_ind + 
    Q[(length(orig_spkrs) + 1):(2 * length(orig_spkrs)), ] * (1 - orig_ind)
  
  phrase_seq <- matrix(NA, ncol = J, nrow = nrow(q_party))
  
  for(j in 1:J){
    #     set.seed(j)
    # for each spkr, draw 200 phrases from MN(200, q_party)
    # we are drawing one and one word per speaker
    phrase_seq[, j] <- sapply(1:nrow(q_party), 
                              function(i) colnames(q_party)[
                                which(as.logical(rmultinom(1, 1, q_party[i, ])))])
  }
  
  
  phrase_seq_list[[glue::glue("{s}")]] <- phrase_seq
  
  
  # extract q for H and V separately
  true_party  <- meta$party_id_numeric
  true_party  <- c(true_party, (1 - true_party))
  q_H         <- Q[which(true_party == 1), ]
  q_V         <- Q[which(true_party == 0), ]
  q_H         <- q_H[match(rownames(q_party), rownames(q_H)), ]
  q_V         <- q_V[match(rownames(q_party), rownames(q_V)), ]
  
  # compute expected posteriors
  rho        <- matrix(NA, ncol = J, nrow = nrow(q_party))
  rho_temp   <- rep(.5, length(orig_spkrs)) # neutral priors
  for(j in 1:J){
    # collect probability distribution over drawn phrases
    q_h_temp <- diag(q_H[, phrase_seq[, j]]) # spkr-row j has drawn phrase j in this construction
    q_v_temp <- diag(q_V[, phrase_seq[, j]]) 
    # expected posteriors
    rho[, j] <- (rho_temp * q_h_temp) / (rho_temp * q_h_temp + (1 - rho_temp) * q_v_temp)
    rho_temp <- rho[, j]
  }
  
  
  
  # partisanship given expected posteriors
  pi     <- party * rho + (1 - party) * (1 - rho)
  pi_avg  <- apply(pi, 2, mean)
  
  sess_name <- paste("pi_avg_sess", s, sep = "_")
  
  sess_name <- str_replace_all(sess_name, "-", "_")
  
  assign(sess_name, pi_avg)
  
  magnitude_list[[glue::glue("{s}")]] <- pi_avg
  
  
  
  
}


magnitude_list <- magnitude_list[!sapply(magnitude_list, is.null)]



saveRDS(magnitude_list, file = paste(dir, "magnitude_list_gender.RDS", sep = "/"))

################################################################################

######### AGE #################################################

################################################################################

# Loading the probabilities for multinomial model



q <- readRDS(file = paste(dir, "q_age_80_2021.rds", sep = "/"))




sessions <- unique(speaker_metadata$session)


## using all sessions
sess <- sort(sessions)


# data lists to store results

phrase_seq_list <- vector("list", length(sessions))
names(phrase_seq_list) <- sessions

magnitude_list <- vector("list", length(sessions))
names(magnitude_list) <- vector("list", length(sessions))


# Setting the number of words

J <- 200

options(warn = 1)

for (s in sess){
  

  print(s)
  
  set.seed(which(s == sess))
  
  id_sess <- unique(speaker_metadata[session == s,]$pid_session)
  

  Q <- q[rownames(q) %in% id_sess,]
  
  # Step 1. Original speakers and meta data 
  ## the first half is the original speakers and the second half are the clones
  
  orig_spkrs  <- rownames(Q)[1: (nrow(Q)/2)]
  ## draw random party for the original speakers 
  party       <- rbinom(length(orig_spkrs), 1, .5)
  ## Meta data on the original speakers
  speaker_metadata[, party_id_numeric := ifelse(age_cat == "old", 1, 0)]
  
  
  
  meta <- speaker_metadata[which(speaker_metadata$pid_session %in% orig_spkrs), ]
  
  # Indicating whether randomized party matches true party for selecting 
  # the appropriate model Qs
  
  orig_ind    <- as.numeric(party == meta$party_id_numeric) 
  
  # generate q vector, taking true rows from Q if orig_ind == 1 and random rows otherwise
  q_party     <- Q[1:length(orig_spkrs), ] * orig_ind + 
    Q[(length(orig_spkrs) + 1):(2 * length(orig_spkrs)), ] * (1 - orig_ind)
  
  phrase_seq <- matrix(NA, ncol = J, nrow = nrow(q_party))
  
  
  
  for(j in 1:J){
    # we are drawing one and one word per speaker
    phrase_seq[, j] <- sapply(1:nrow(q_party), 
                              function(i) colnames(q_party)[
                                which(as.logical(rmultinom(1, 1, q_party[i, ])))])
  }
  
  phrase_seq_list[[glue::glue("{s}")]] <- phrase_seq
  
  
  # extract q for H and V separately
  true_party  <- meta$party_id_numeric
  true_party  <- c(true_party, (1 - true_party))
  q_H         <- Q[which(true_party == 1), ]
  q_V         <- Q[which(true_party == 0), ]
  q_H         <- q_H[match(rownames(q_party), rownames(q_H)), ]
  q_V         <- q_V[match(rownames(q_party), rownames(q_V)), ]
  
  # compute expected posteriors
  rho        <- matrix(NA, ncol = J, nrow = nrow(q_party))
  rho_temp   <- rep(.5, length(orig_spkrs)) # neutral priors
  for(j in 1:J){
    # collect probability distribution over drawn phrases
    #    print(j)
    q_h_temp <- diag(q_H[, phrase_seq[, j]]) # spkr-row j has drawn phrase j in this construction
    q_v_temp <- diag(q_V[, phrase_seq[, j]]) 
    # expected posteriors
    rho[, j] <- (rho_temp * q_h_temp) / (rho_temp * q_h_temp + (1 - rho_temp) * q_v_temp)
    rho_temp <- rho[, j]
  }
  
  
  
  # partisanship given expected posteriors
  pi     <- party * rho + (1 - party) * (1 - rho)
  pi_avg  <- apply(pi, 2, mean)
  
  sess_name <- paste("pi_avg_sess", s, sep = "_")
  
  sess_name <- str_replace_all(sess_name, "-", "_")
  
  assign(sess_name, pi_avg)
  
  
  
  magnitude_list[[glue::glue("{s}")]] <- pi_avg
  
  
  
}



magnitude_list <- magnitude_list[!sapply(magnitude_list, is.null)]


saveRDS(magnitude_list, file = paste(dir, "magnitude_list_age.RDS", sep = "/"))

################################################################################

######### URBAN #################################################

################################################################################

# Loading the probabilities for multinomial model



q <- readRDS(file = paste(dir, "q_town_80_2021.rds", sep = "/"))




## Sessions

sessions <- unique(speaker_metadata$session)


## using all sessions

sess <- sort(sessions)

## Number of words

J <- 200



phrase_seq_list <- vector("list", length(sessions))
names(phrase_seq_list) <- sessions

magnitude_list <- vector("list", length(sessions))
names(magnitude_list) <- vector("list", length(sessions))


for (s in sess){
  

  print(s)

  set.seed(which(sess %in% s))
  
  
  id_sess <- unique(speaker_metadata[session == s,]$pid_session)
  
  

  Q <- q[rownames(q) %in% id_sess,]
  
  # Step 1. Original speakers and meta data 
  ## the first half is the original speakers and the second half are the clones
  
  orig_spkrs  <- rownames(Q)[1: (nrow(Q)/2)]
  ## draw random party for the original speakers # Here Party is urban indicator
  party       <- rbinom(length(orig_spkrs), 1, .5)
  ## Meta data on the original speakers
  speaker_metadata[, party_id_numeric := ifelse(town == "urban", 1, 0)]
  meta <- speaker_metadata[which(speaker_metadata$pid_session %in% orig_spkrs), ]
  
  # Indicating whether randomized party matches true party for selecting 
  # the appropriate model Qs
  
  orig_ind    <- as.numeric(party == meta$party_id_numeric) 
  
  # generate q vector, taking true rows from Q if orig_ind == 1 and random rows otherwise
  q_party     <- Q[1:length(orig_spkrs), ] * orig_ind + 
    Q[(length(orig_spkrs) + 1):(2 * length(orig_spkrs)), ] * (1 - orig_ind)
  
  phrase_seq <- matrix(NA, ncol = J, nrow = nrow(q_party))
  for(j in 1:J){

    phrase_seq[, j] <- sapply(1:nrow(q_party), 
                              function(i) colnames(q_party)[
                                which(as.logical(rmultinom(1, 1, q_party[i, ])))])
  }
  
  
  phrase_seq_list[[glue::glue("{s}")]] <- phrase_seq
  
  
  # extract q for H and V separately
  true_party  <- meta$party_id_numeric
  true_party  <- c(true_party, (1 - true_party))
  q_H         <- Q[which(true_party == 1), ]
  q_V         <- Q[which(true_party == 0), ]
  q_H         <- q_H[match(rownames(q_party), rownames(q_H)), ]
  q_V         <- q_V[match(rownames(q_party), rownames(q_V)), ]
  
  # compute expected posteriors
  rho        <- matrix(NA, ncol = J, nrow = nrow(q_party))
  rho_temp   <- rep(.5, length(orig_spkrs)) # neutral priors
  for(j in 1:J){
    # collect probability distribution over drawn phrases
    q_h_temp <- diag(q_H[, phrase_seq[, j]]) # spkr-row j has drawn phrase j in this construction
    q_v_temp <- diag(q_V[, phrase_seq[, j]]) 
    # expected posteriors
    rho[, j] <- (rho_temp * q_h_temp) / (rho_temp * q_h_temp + (1 - rho_temp) * q_v_temp)
    rho_temp <- rho[, j]
  }
  
  
  
  # partisanship given expected posteriors
  pi     <- party * rho + (1 - party) * (1 - rho)
  pi_avg  <- apply(pi, 2, mean)
  
  sess_name <- paste("pi_avg_sess", s, sep = "_")
  
  sess_name <- str_replace_all(sess_name, "-", "_")
  
  assign(sess_name, pi_avg)
  
  magnitude_list[[glue::glue("{s}")]] <- pi_avg
  
  

  
}


magnitude_list <- magnitude_list[!sapply(magnitude_list, is.null)]



saveRDS(magnitude_list, file = paste(dir, "magnitude_list_town.RDS", sep = "/"))


################################################################################

######### Father's Background #################################################

################################################################################

# Loading the probabilities for multinomial model


q <- readRDS(file = paste(dir, "q_occ_80_2021.rds", sep = "/"))




## Sessions

sessions <- unique(speaker_metadata$session)


## using all sessions

sess <- sort(sessions)

## Number of words

J <- 200

phrase_seq_list <- vector("list", length(sessions))
names(phrase_seq_list) <- sessions

magnitude_list <- vector("list", length(sessions))
names(magnitude_list) <- vector("list", length(sessions))

options(warn = 1)


for (s in sess){
  
  print(s)
  
  set.seed(which(sess %in% s))
  
  

  id_sess <- unique(speaker_metadata[session == s,]$pid_session)
  
  

  Q <- q[rownames(q) %in% id_sess,]
  
  # Step 1. Original speakers and meta data 
  ## the first half is the original speakers and the second half are the clones
  
  orig_spkrs  <- rownames(Q)[1: (nrow(Q)/2)]
  ## draw random party for the original speakers # Here Party is social background
  party       <- rbinom(length(orig_spkrs), 1, .5)
  ## Meta data on the original speakers
  speaker_metadata[, party_id_numeric := ifelse(occupation == "white", 1, 0)]
  meta <- speaker_metadata[which(speaker_metadata$pid_session %in% orig_spkrs), ]
  
  # Indicating whether randomized party matches true party for selecting 
  # the appropriate model Qs
  
  orig_ind    <- as.numeric(party == meta$party_id_numeric) 
  
  # generate q vector, taking true rows from Q if orig_ind == 1 and random rows otherwise
  q_party     <- Q[1:length(orig_spkrs), ] * orig_ind + 
    Q[(length(orig_spkrs) + 1):(2 * length(orig_spkrs)), ] * (1 - orig_ind)
  
  phrase_seq <- matrix(NA, ncol = J, nrow = nrow(q_party))
  for(j in 1:J){

    phrase_seq[, j] <- sapply(1:nrow(q_party), 
                              function(i) colnames(q_party)[
                                which(as.logical(rmultinom(1, 1, q_party[i, ])))])
  }
  
  
  phrase_seq_list[[glue::glue("{s}")]] <- phrase_seq
  
  
  # extract q for H and V separately
  true_party  <- meta$party_id_numeric
  true_party  <- c(true_party, (1 - true_party))
  q_H         <- Q[which(true_party == 1), ]
  q_V         <- Q[which(true_party == 0), ]
  q_H         <- q_H[match(rownames(q_party), rownames(q_H)), ]
  q_V         <- q_V[match(rownames(q_party), rownames(q_V)), ]
  
  # compute expected posteriors
  rho        <- matrix(NA, ncol = J, nrow = nrow(q_party))
  rho_temp   <- rep(.5, length(orig_spkrs)) # neutral priors
  for(j in 1:J){
    # collect probability distribution over drawn phrases
    q_h_temp <- diag(q_H[, phrase_seq[, j]]) # spkr-row j has drawn phrase j in this construction
    q_v_temp <- diag(q_V[, phrase_seq[, j]]) 
    # expected posteriors
    rho[, j] <- (rho_temp * q_h_temp) / (rho_temp * q_h_temp + (1 - rho_temp) * q_v_temp)
    rho_temp <- rho[, j]
  }
  
  
  
  # partisanship given expected posteriors
  pi     <- party * rho + (1 - party) * (1 - rho)
  pi_avg  <- apply(pi, 2, mean)
  
  sess_name <- paste("pi_avg_sess", s, sep = "_")
  
  sess_name <- str_replace_all(sess_name, "-", "_")
  
  assign(sess_name, pi_avg)
  
  magnitude_list[[glue::glue("{s}")]] <- pi_avg
  

}


magnitude_list <- magnitude_list[!sapply(magnitude_list, is.null)]



saveRDS(magnitude_list, file = paste(dir, "magnitude_list_social_background.RDS", sep = "/"))


